import os
import asyncio
import logging
from pathlib import Path
import aiosqlite
from typing import Optional
import xxhash
import aiofiles
import shutil
import time
from datetime import datetime

# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)
# Importing this module configures root logging at INFO so progress messages
# are visible; basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

class DatabaseMigration:
    """Move bulky page content out of ``crawled_data`` into content-addressed files.

    Each content column (html, cleaned_html, markdown, extracted_content,
    screenshot) is written to a file named after the xxh64 hash of the
    content, inside a per-type directory next to the database file.  The
    column value is then replaced by that hash.
    """

    # xxh64 hex digests are 16 lowercase hexadecimal characters.
    _HASH_LENGTH = 16
    _HEX_DIGITS = frozenset("0123456789abcdef")

    def __init__(self, db_path: str):
        """Remember *db_path* and create the content directories beside it."""
        self.db_path = db_path
        self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))

    def _ensure_content_dirs(self, base_path: str) -> dict:
        """Create one directory per content type under *base_path*.

        Returns:
            Mapping of content-type key (``'html'``, ``'cleaned'``,
            ``'markdown'``, ``'extracted'``, ``'screenshots'``) to the
            directory path for that type.
        """
        dirs = {
            'html': 'html_content',
            'cleaned': 'cleaned_html',
            'markdown': 'markdown_content',
            'extracted': 'extracted_content',
            'screenshots': 'screenshots'
        }
        content_paths = {}
        for key, dirname in dirs.items():
            path = os.path.join(base_path, dirname)
            os.makedirs(path, exist_ok=True)
            content_paths[key] = path
        return content_paths

    def _generate_content_hash(self, content: str) -> str:
        """Return the xxh64 hex digest of *content* (encoded as UTF-8)."""
        x = xxhash.xxh64()
        x.update(content.encode())
        return x.hexdigest()

    def _is_stored_hash(self, value: str, content_type: str) -> bool:
        """Return True if *value* names a file already stored for *content_type*."""
        if len(value) != self._HASH_LENGTH or not self._HEX_DIGITS.issuperset(value):
            return False
        return os.path.isfile(os.path.join(self.content_paths[content_type], value))

    async def _store_content(self, content: str, content_type: str) -> str:
        """Write *content* to its content-addressed file and return the hash.

        Args:
            content: Text to store.  Falsy values (``None``, ``""``) are not
                stored.
            content_type: Key into ``self.content_paths`` selecting the
                target directory.

        Returns:
            The hash that names the stored file, or ``""`` when *content*
            is falsy.
        """
        if not content:
            return ""

        # Idempotency guard: when the migration is run again, the column
        # already holds a hash.  Hashing that hash would create a file whose
        # "content" is the old hash and leave the row pointing at it, which
        # effectively loses the real content.  A value that looks like a
        # digest AND has a matching file in this directory is treated as
        # already migrated.
        if self._is_stored_hash(content, content_type):
            return content

        content_hash = self._generate_content_hash(content)
        file_path = os.path.join(self.content_paths[content_type], content_hash)

        if not os.path.exists(file_path):
            # Write to a temporary name and rename into place so that an
            # interrupted write can never leave a truncated file under the
            # final name (which the existence check above would then accept).
            tmp_path = f"{file_path}.tmp"
            try:
                async with aiofiles.open(tmp_path, 'w', encoding='utf-8') as f:
                    await f.write(content)
                os.replace(tmp_path, file_path)
            except BaseException:
                # Best-effort cleanup of the partial temp file; the original
                # error is what the caller needs to see.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
                raise

        return content_hash

    async def migrate_database(self):
        """Migrate existing database to file-based storage.

        Reads every row of ``crawled_data``, stores each content column in a
        file, and updates the row (matched by ``url``) with the resulting
        hashes.  All updates are committed together at the end.

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """
        logger.info("Starting database migration...")

        try:
            async with aiosqlite.connect(self.db_path) as db:
                # Get all rows
                async with db.execute(
                    '''SELECT url, html, cleaned_html, markdown, 
                       extracted_content, screenshot FROM crawled_data'''
                ) as cursor:
                    rows = await cursor.fetchall()

                migrated_count = 0
                for row in rows:
                    url, html, cleaned_html, markdown, extracted_content, screenshot = row

                    # Store content in files and get hashes
                    html_hash = await self._store_content(html, 'html')
                    cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
                    markdown_hash = await self._store_content(markdown, 'markdown')
                    extracted_hash = await self._store_content(extracted_content, 'extracted')
                    screenshot_hash = await self._store_content(screenshot, 'screenshots')

                    # Update database with hashes
                    await db.execute('''
                        UPDATE crawled_data 
                        SET html = ?, 
                            cleaned_html = ?,
                            markdown = ?,
                            extracted_content = ?,
                            screenshot = ?
                        WHERE url = ?
                    ''', (html_hash, cleaned_hash, markdown_hash,
                         extracted_hash, screenshot_hash, url))

                    migrated_count += 1
                    if migrated_count % 100 == 0:
                        logger.info(f"Migrated {migrated_count} records...")

                # Single commit: if anything above raised, no row is updated.
                await db.commit()
                logger.info(f"Migration completed. {migrated_count} records processed.")

        except Exception as e:
            logger.error(f"Migration failed: {e}")
            raise

async def backup_database(db_path: str) -> Optional[str]:
    """Create a timestamped backup of the SQLite database at *db_path*.

    The backup is taken with SQLite's online backup API rather than a raw
    file copy, so it is a consistent snapshot even if another connection is
    writing to the database.  The work runs in an executor thread so the
    event loop is not blocked.

    Args:
        db_path: Path of the database file to back up.

    Returns:
        Path of the backup file (``<db_path>.backup_<YYYYmmdd_HHMMSS>``),
        or ``None`` when *db_path* does not exist.

    Raises:
        Exception: Any failure while creating the backup is logged and
            re-raised.
    """
    import sqlite3  # local import: only this function needs it

    if not os.path.exists(db_path):
        logger.info("No existing database found. Skipping backup.")
        return None

    # Create backup with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = f"{db_path}.backup_{timestamp}"

    def _snapshot() -> None:
        # Both connections are opened and used inside the worker thread,
        # because sqlite3 connections are bound to their creating thread.
        source = sqlite3.connect(db_path)
        try:
            target = sqlite3.connect(backup_path)
            try:
                source.backup(target)
            finally:
                target.close()
        finally:
            source.close()

    try:
        await asyncio.get_running_loop().run_in_executor(None, _snapshot)
        logger.info(f"Database backup created at: {backup_path}")
        return backup_path
    except Exception as e:
        logger.error(f"Backup failed: {e}")
        # Do not leave a partial snapshot behind that could later be
        # mistaken for a good backup.
        try:
            os.remove(backup_path)
        except OSError:
            pass
        raise
    
async def run_migration(db_path: Optional[str] = None):
    """Back up the database and then migrate it to file-based storage.

    When *db_path* is omitted, ``~/.crawl4ai/crawl4ai.db`` is used.  Nothing
    happens if the database file is missing, and the migration is skipped
    if the backup step does not return a backup path.
    """
    target = db_path
    if target is None:
        target = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")

    if os.path.exists(target):
        # The migration only proceeds once a backup has been made.
        if await backup_database(target):
            migrator = DatabaseMigration(target)
            await migrator.migrate_database()
        return

    logger.info("No existing database found. Skipping migration.")
    
def main():
    """Command-line entry point: parse ``--db-path`` and run the migration."""
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
    cli.add_argument('--db-path', help='Custom database path')
    options = cli.parse_args()

    # options.db_path is None when the flag is not given; run_migration
    # receives it unchanged.
    asyncio.run(run_migration(options.db_path))

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()